/*
 * prefixScanKernelsDX11
 *
 * HLSL compute-shader source, embedded as a C string literal (one adjacent
 * string literal per shader line). The text is consumed at runtime, so every
 * character inside the quotes is program behavior and must not be edited for
 * cosmetic reasons.
 *
 * Contents of the shader text:
 *   - SortCB (b0): m_numElems, m_numBlocks, m_numScanBlocks.
 *   - UAVs: dst (u0), src (u1), sumBuffer (u2), and blockSum2 (also u1).
 *   - ldsData: 2048 groupshared u32 values used as scan scratch space.
 *   - ScanExclusive: helper that scans ldsData[0..n) in place.
 *   - Entry points, each declared with numthreads(WG_SIZE = 128, 1, 1):
 *       LocalScanKernel, TopLevelScanKernel, AddOffsetKernel.
 *
 * NOTE(review): blockSum2 and src are both declared at register u1; no single
 * entry point here references both, but confirm the host binds the intended
 * buffer for each dispatch.
 * NOTE(review): how the host sizes the dispatches and which buffers it binds
 * for each kernel is not visible in this file.
 */
static const char* prefixScanKernelsDX11= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef uint u32;\n"
"\n"
"#define GET_GROUP_IDX groupIdx.x\n"
"#define GET_LOCAL_IDX localIdx.x\n"
"#define GET_GLOBAL_IDX globalIdx.x\n"
"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
"\n"
"//	takahiro end\n"
"#define WG_SIZE 128\n"
"\n"
"#define GET_GROUP_SIZE WG_SIZE\n"
"\n"
"\n"
"cbuffer SortCB : register( b0 )\n"
"{\n"
"	int m_numElems;\n"
"	int m_numBlocks;\n"
"	int m_numScanBlocks;\n"
"};\n"
" \n"
"RWStructuredBuffer<uint> dst : register( u0 );\n"
"RWStructuredBuffer<uint> src : register( u1 );\n"
"RWStructuredBuffer<uint> sumBuffer : register( u2 );\n"
"\n"
"\n"
"groupshared u32 ldsData[2048];\n"
"\n"
/*
 * ScanExclusive(n, lIdx, lSize): scans ldsData[0..n) in place.
 *   Phase 1 (up-sweep): pairwise partial sums with doubling stride, so the
 *     total ends up in ldsData[n-1].
 *   Thread lIdx == 0 then saves that total into blocksum and zeroes
 *     ldsData[n-1].
 *   Phase 2 (down-sweep): halving stride, swapping and adding to turn the
 *     partial sums into exclusive prefix sums.
 * Each thread strides over the active pairs by lSize, with a group barrier
 * before every level.
 * NOTE(review): blocksum is assigned only when lIdx == 0, so the value
 * returned to every other thread is unassigned; the callers below read it
 * only from local/global thread 0.
 * NOTE(review): the halving/doubling loops appear to assume n is a power of
 * two -- confirm against the host code that sets m_numScanBlocks.
 */
"u32 ScanExclusive(u32 n, int lIdx, int lSize)\n"
"{\n"
"	u32 blocksum;\n"
"    int offset = 1;\n"
"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
"    {\n"
"        GROUP_LDS_BARRIER;\n"
"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
"        {\n"
"            int ai = offset*(2*iIdx+1)-1;\n"
"            int bi = offset*(2*iIdx+2)-1;\n"
"            ldsData[bi] += ldsData[ai];\n"
"        }\n"
"	}\n"
"\n"
"    GROUP_LDS_BARRIER;\n"
"\n"
"    if( lIdx == 0 )\n"
"	{\n"
"		blocksum = ldsData[ n-1 ];\n"
"        ldsData[ n-1 ] = 0;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	offset >>= 1;\n"
"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
"    {\n"
"        GROUP_LDS_BARRIER;\n"
"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
"        {\n"
"            int ai = offset*(2*iIdx+1)-1;\n"
"            int bi = offset*(2*iIdx+2)-1;\n"
"            u32 temp = ldsData[ai];\n"
"            ldsData[ai] = ldsData[bi];\n"
"            ldsData[bi] += temp;\n"
"        }\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	return blocksum;\n"
"}\n"
"\n"
/*
 * LocalScanKernel: each thread loads src[2*gIdx] and src[2*gIdx+1] into
 * ldsData (0 for indices at or beyond m_numElems), the group scans
 * WG_SIZE*2 entries, local thread 0 stores the returned total in
 * sumBuffer[group index], and the scanned values are written to dst for
 * indices below m_numElems.
 */
"[numthreads(WG_SIZE, 1, 1)]\n"
"void LocalScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"\n"
"	ldsData[2*lIdx]     = ( 2*gIdx < m_numElems )? src[2*gIdx]: 0;\n"
"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < m_numElems )? src[2*gIdx + 1]: 0;\n"
"\n"
"	u32 sum = ScanExclusive(WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
"	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
"\n"
"	if( (2*gIdx) < m_numElems )\n"
"    {\n"
"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
"	}\n"
"	if( (2*gIdx + 1) < m_numElems )\n"
"	{\n"
"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
"    }\n"
"}\n"
"\n"
/*
 * TopLevelScanKernel: copies dst[0..m_numBlocks) into ldsData, zero-filling
 * up to m_numScanBlocks, scans m_numScanBlocks entries, writes the first
 * m_numBlocks results back to dst, and the thread with global index 0 stores
 * the returned total in dst[m_numBlocks].
 * NOTE(review): this kernel reads and writes dst (u0), not sumBuffer --
 * presumably the host binds the per-block sum buffer at u0 for this
 * dispatch; confirm.
 * NOTE(review): ldsData holds 2048 entries and no clamp on m_numScanBlocks
 * is visible here, so the host must keep m_numScanBlocks <= 2048.
 */
"[numthreads(WG_SIZE, 1, 1)]\n"
"void TopLevelScanKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int lSize = GET_GROUP_SIZE;\n"
"\n"
"	for(int i=lIdx; i<m_numScanBlocks; i+=lSize )\n"
"	{\n"
"		ldsData[i] = (i<m_numBlocks)? dst[i]:0;\n"
"	}\n"
"\n"
"	GROUP_LDS_BARRIER;\n"
"\n"
"	u32 sum = ScanExclusive(m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"\n"
"	for(int i=lIdx; i<m_numBlocks; i+=lSize )\n"
"	{\n"
"		dst[i] = ldsData[i];\n"
"	}\n"
"\n"
"	if( gIdx == 0 )\n"
"	{\n"
"		dst[m_numBlocks] = sum;\n"
"	}\n"
"}\n"
"\n"
"\n"
" \n"
"RWStructuredBuffer<uint> blockSum2 : register( u1 );\n"
"\n"
/*
 * AddOffsetKernel: group g works on block g+1, where a block is WG_SIZE*2
 * elements. It reads blockSum2[g+1] and adds it to every dst element in
 * [(g+1)*blockSize, min((g+2)*blockSize, m_numElems)), with the group's
 * threads striding by WG_SIZE.
 * NOTE(review): block 0 is never touched by this kernel -- presumably
 * because its offset is zero; confirm against the host dispatch size.
 */
"[numthreads(WG_SIZE, 1, 1)]\n"
"void AddOffsetKernel(uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID)\n"
"{\n"
"	const u32 blockSize = WG_SIZE*2;\n"
"\n"
"	int myIdx = GET_GROUP_IDX+1;\n"
"	int llIdx = GET_LOCAL_IDX;\n"
"\n"
"	u32 iBlockSum = blockSum2[myIdx];\n"
"\n"
"	int endValue = min((myIdx+1)*(blockSize), m_numElems);\n"
"	for(int i=myIdx*blockSize+llIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
"	{\n"
"		dst[i] += iBlockSum;\n"
"	}\n"
"}\n"
"\n"
;
